#154 Improvements to website deduplication logic

as discussed with @cantino

Alex Piggott 11 years ago
parent
commit
b1898cc7ff
2 changed files with 50 additions and 6 deletions
  1. 37 5
      app/models/agents/website_agent.rb
  2. 13 1
      spec/models/agents/website_agent_spec.rb

+ 37 - 5
app/models/agents/website_agent.rb

@@ -36,7 +36,7 @@ module Agents
36 36
 
37 37
       Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent (only used to set the "working" status).
38 38
 
39
-      Set `uniqueness_look_back` (defaults to 10000) to limit the number of events checked for uniqueness (typically for performance).
39
+      Set `uniqueness_look_back` (defaults to the larger of 200, 3x the number of received events) to limit the number of events checked for uniqueness (typically for performance).
40 40
     MD
41 41
 
42 42
     event_description do
@@ -45,7 +45,8 @@ module Agents
45 45
 
46 46
     default_schedule "every_12h"
47 47
 
48
-    UNIQUENESS_LOOK_BACK = 10000
48
+    UNIQUENESS_LOOK_BACK = 200
49
+    UNIQUENESS_FACTOR = 3
49 50
 
50 51
     def working?
51 52
       event_created_within?(options['expected_update_period_in_days']) && !recent_error_logs?
@@ -65,10 +66,32 @@ module Agents
65 66
     end
66 67
 
67 68
     def validate_options
69
+      # Check required fields are present
68 70
       errors.add(:base, "url and expected_update_period_in_days are required") unless options['expected_update_period_in_days'].present? && options['url'].present?
69 71
       if !options['extract'].present? && extraction_type != "json"
70 72
         errors.add(:base, "extract is required for all types except json")
71 73
       end
74
+      # Check options:
75
+      if options['mode'].present?
76
+        if options['mode'] != "on_change" && options['mode'] != "all"
77
+          errors.add(:base, "mode should be all or on_change")
78
+        end
79
+      end
80
+      # Check integer variables:      
81
+      if options['expected_update_period_in_days'].present?
82
+	      begin
83
+	      	Integer(options['expected_update_period_in_days'])
84
+	      rescue
85
+	      	errors.add(:base, "Invalid expected_update_period_in_days format")
86
+	      end
87
+	  end
88
+      if options['uniqueness_look_back'].present?
89
+	      begin
90
+	      	Integer(options['uniqueness_look_back'])
91
+	      rescue
92
+	      	errors.add(:base, "Invalid uniqueness_look_back format")
93
+	      end      
94
+      end
72 95
     end
73 96
 
74 97
     def check
@@ -84,9 +107,9 @@ module Agents
84 107
       end
85 108
       request.on_success do |response|
86 109
         doc = parse(response.body)
87
-        old_events = previous_payloads
88 110
 
89 111
         if extract_full_json?
112
+          old_events = previous_payloads 1
90 113
           result = doc
91 114
           if store_payload? old_events, result
92 115
             log "Storing new result for '#{name}': #{result.inspect}"
@@ -119,6 +142,7 @@ module Agents
119 142
             return
120 143
           end
121 144
       
145
+          old_events = previous_payloads num_unique_lengths.first
122 146
           num_unique_lengths.first.times do |index|
123 147
             result = {}
124 148
             options['extract'].keys.each do |name|
@@ -159,8 +183,16 @@ module Agents
159 183
       raise "Illegal options[mode]: " + options['mode'].to_s
160 184
     end
161 185
 
162
-    def previous_payloads
163
-      look_back = options['uniqueness_look_back'] ? options['uniqueness_look_back'].to_i : UNIQUENESS_LOOK_BACK
186
+    def previous_payloads(num_events)
187
+      if options['uniqueness_look_back'].present?
188
+	      look_back = options['uniqueness_look_back'].to_i
189
+      else
190
+      	# Larger of UNIQUENESS_FACTOR*num_events and UNIQUENESS_LOOK_BACK
191
+	    look_back = UNIQUENESS_FACTOR*num_events
192
+	    if look_back < UNIQUENESS_LOOK_BACK
193
+	    	look_back = UNIQUENESS_LOOK_BACK
194
+	    end
195
+      end
164 196
       events.order("id desc").limit(look_back) if options['mode'].to_s == "on_change"
165 197
     end
166 198
 

+ 13 - 1
spec/models/agents/website_agent_spec.rb

@@ -21,6 +21,18 @@ describe Agents::WebsiteAgent do
21 21
     end
22 22
 
23 23
     describe "#check" do
24
+    
25
+      it "should validate the integer fields" do
26
+        @checker.options['expected_update_period_in_days'] = "nonsense"
27
+        lambda { @checker.save! }.should raise_error;
28
+        @checker.options['expected_update_period_in_days'] = "2"
29
+        @checker.options['uniqueness_look_back'] = "nonsense"
30
+        lambda { @checker.save! }.should raise_error;
31
+        @checker.options['mode'] = "nonsense"
32
+        lambda { @checker.save! }.should raise_error;
33
+        @checker.options = @site
34
+      end
35
+    
24 36
       it "should check for changes (and update Event.expires_at)" do
25 37
         lambda { @checker.check }.should change { Event.count }.by(1)
26 38
         event = Event.last
@@ -107,7 +119,7 @@ describe Agents::WebsiteAgent do
107 119
           'expected_update_period_in_days' => 2,
108 120
           'type' => "html",
109 121
           'url' => "http://xkcd.com",
110
-          'mode' => :on_change,
122
+          'mode' => "on_change",
111 123
           'extract' => {
112 124
             'url' => {'css' => "#topLeft a", 'attr' => "href"},
113 125
             'title' => {'css' => "#topLeft a", 'text' => "true"}